################################################################################################
## Replication File for 
## "Capturing Clicks: How the Chinese Government Uses Clickbait to Compete for Visibility"
## Yingdan Lu and Jennifer Pan
## April, 2020
################################################################################################

################################################################################################
## Clickbait, clicks and visibility
################################################################################################

######### Set Up #########
#install.packages("lubridate")
library(lubridate) #version 1.7.4
#install.packages("ggplot2")
library(ggplot2) #version 3.2.1
#install.packages("grid")
library(grid) #version 3.6.0
#install.packages("gridExtra")
library(gridExtra) #version 2.3
#install.packages("remotes")
# library(remotes)
# install_github("IQSS/Zelig")
library(Zelig) #version 5.1.6.1
#install.packages("MASS")
library(MASS) #version 7.3-51.4
# install.packages("R2admb")
# install.packages("glmmADMB",
#                  repos=c("http://glmmadmb.r-forge.r-project.org/repos",
#                          getOption("repos")),type="source")
library(glmmADMB) #version 0.8.3.3
#install.packages("texreg")
library(texreg) #version 1.36.23
#install.packages("extrafont")
library(extrafont) #version 0.17
#install.packages("dplyr")
library(dplyr) #version 0.8.1

# load fonts
loadfonts()

setwd("..")

# import the sample dataset
dt <- read.csv("data/sample_posts.csv", header = T, encoding = "UTF-8", stringsAsFactors = F)
city_data <- read.csv("data/city_data.csv", header = T, encoding = "UTF-8", stringsAsFactors = F)
dt_total <- read.csv("data/total_posts.csv", header = T, encoding = "UTF-8", stringsAsFactors = F)

# preprocessing the sample data
dt$date_pek <- mdy(dt$date_pek)
dt$excl <- ifelse(dt$excl_mark >= 1,1,0)
dt$que <- ifelse(dt$question_mark >= 1,1,0)
dt$ell <- ifelse(dt$ellipsis_mark >= 1,1,0)
dt$pron <- ifelse(dt$pronoun_num >= 1,1,0)
dt$phrase <- ifelse(dt$phrases_num>= 1,1,0)

# select data that have likes information and after the wechat upgrades
dt_likes <- dt[!is.na(dt$likes) & dt$date_pek >= "2019-03-14",]

# subsets all data that have reads information
dt_reads <- dt[!is.na(dt$reads),]

# add account factor variable for both datasets
dt_likes$account_factor <- as.factor(dt_likes$account_name)
dt_reads$account_factor <- as.factor(dt_reads$account_name)

# log the reads for further computation
dt_likes$logreads <- log(dt_likes$reads)


######### Table 1 ###########
# construct the negative binomial models with reads as DV
##model 1: clickbait indicators with city fixed effects
reg1 <- zelig(reads ~ hyperbolic+excl+ell+phrase+listicles+que+pron+slang+gennn+
               account_factor, model = "negbin", data = dt_reads)
##model 2: clickbait indicators, appeal indicators with city fixed effects
reg2 <- zelig(reads ~ hyperbolic+excl+ell+phrase+listicles+que+pron+slang+gennn+
                pride+joy+anger+fear+warmth+vision+account_factor,
              model = "negbin", data = dt_reads)
##model 3: clickbait indicators, appeal indicators, topic indicators with city fixed effects
reg3 <- zelig(reads ~ hyperbolic+excl+ell+phrase+listicles+que+pron+slang+gennn+
                pride+joy+anger+fear+warmth+vision+
                ideo+job+gov+leader+fame+life_guidance+
                account_factor,model = "negbin", data = dt_reads)

# output the model statistics
summary(reg1)
summary(reg2)
summary(reg3)

# simulation for the quantity of interest for hyperbolic variable
set.seed(120)
reg3 <- zelig(reads ~ ideo+job+gov+leader+fame+life_guidance+
                listicles+gennn+hyperbolic+slang+excl+que+
                ell+phrase+pron+joy+pride+anger+fear+warmth+vision+
                account_factor,model = "negbin", data = dt_reads)

hyper.read.sim <- reg3 %>%
  setx(hyperbolic = 0) %>%
  setx1(hyperbolic = 1) %>%
  sim(n=100000) 

reg3.x0 <- hyper.read.sim$get_qi(qi="ev", xvalue="x")
reg3.hyper0.ev <- c(mean(reg3.x0), quantile(reg3.x0, .025), quantile(reg3.x0, .975))
reg3.x1 <- hyper.read.sim$get_qi(qi="ev", xvalue="x1")
reg3.hyper1.ev <- c(mean(reg3.x1), quantile(reg3.x1, .025), quantile(reg3.x1, .975))

hyper.read.out <- hyper.read.sim$get_qi(qi="fd", xvalue="x1")
hyper.read.fd <- c(mean(hyper.read.out), quantile(hyper.read.out, .025),
                   quantile(hyper.read.out, .975))

# simulation for the quantity of interest exclamation mark variable
reg3 <- zelig(reads ~ ideo+job+gov+leader+fame+life_guidance+
                listicles+gennn+hyperbolic+slang+excl+que+
                ell+phrase+pron+joy+pride+anger+fear+warmth+vision+
                account_factor,model = "negbin", data = dt_reads)
set.seed(120)
excl.read.sim <- reg3 %>%
  setx(excl = 0) %>%
  setx1(excl = 1) %>%
  sim(n=100000)

reg3.x0 <- excl.read.sim$get_qi(qi="ev", xvalue="x")
reg3.excl0.ev <- c(mean(reg3.x0), quantile(reg3.x0, .025), quantile(reg3.x0, .975))
reg3.x1 <- excl.read.sim$get_qi(qi="ev", xvalue="x1")
reg3.excl1.ev <- c(mean(reg3.x1), quantile(reg3.x1, .025), quantile(reg3.x1, .975))

excl.read.out <- excl.read.sim$get_qi(qi="fd", xvalue="x1")
excl.read.fd <- c(mean(excl.read.out), quantile(excl.read.out, .025),
                   quantile(excl.read.out, .975))

######### Table 2 ###########
# construct the zero-inflated negative binomial models with likes as DV
##model 1: clickbait indicators, log reads with city fixed effects
z1 <- glmmadmb(likes ~ slang+hyperbolic+pron+que+ell+excl+gennn+
                 phrase+listicles+(1|account_factor)+logreads, family = "nbinom", 
               zeroInflation=TRUE, data = dt_likes)
##model 2: clickbait indicators, appeal indicators, log reads with city fixed effects
z2 <- glmmadmb(likes ~ slang+hyperbolic+pron+que+ell+excl+gennn+
                 phrase+listicles+warmth+anger+joy+pride+fear+vision+
                 (1|account_factor)+logreads, family = "nbinom", 
               zeroInflation=TRUE, data = dt_likes)
##model 3: clickbait indicators, appeal indicators, topic indicators, log reads with city fixed effects
z3 <- glmmadmb(likes ~ slang+hyperbolic+pron+que+ell+excl+gennn+
                 phrase+listicles+warmth+anger+joy+pride+fear+vision+
                 ideo+job+gov+leader+fame+life_guidance+
                 (1|account_factor)+logreads, family = "nbinom", zeroInflation=TRUE, 
               data = dt_likes)

# output the regression statistics
texreg(list(z1,z2,z3))

# calculate the AIC differences between models
reg3.p <- glm(reads ~ ideo+job+gov+leader+fame+life_guidance+
                listicles+gennn+hyperbolic+slang+excl+que+
                ell+phrase+pron+joy+pride+anger+fear+warmth+vision+
                account_factor, family = "poisson", data = dt_reads)
reg3.n <- glm.nb(reads ~ ideo+job+gov+leader+fame+life_guidance+
                   listicles+gennn+hyperbolic+slang+excl+que+
                   ell+phrase+pron+joy+pride+anger+fear+warmth+vision+
                   account_factor, data = dt_reads)
likes3.n <- glm.nb(likes ~ ideo+job+gov+leader+fame+life_guidance+
                     listicles+gennn+hyperbolic+slang+excl+que+
                     ell+phrase+pron+joy+pride+anger+fear+warmth+vision+logreads+
                     account_factor, data = dt_likes)

# compute and compare the AIC score of different models
AIC(reg3.p)/AIC(reg3.n)
AIC(z3)/AIC(likes3.n)


######### Table 3 #########
# preprocessing the city indicators and clickbait data
for (i in (4:11)){
  city_data[,i] <- log(city_data[,i])
}
city_data$registration <- mdy(city_data$registration)
city_data$accountdur<- abs(time_length(interval(ymd("2019-09-10"), 
                                                city_data$registration), "day"))
dt$bait <- ifelse(apply(dt[,c("listicles","gennn","hyperbolic","slang","excl_mark", "question_mark",
                              "ellipsis_mark", "pronoun_num", "phrases_num")] >= 1,1,any),1,0)
# define a function and normalize the WCI data
znorm <- function(ts){
  ts.mean <- mean(as.numeric(ts), na.rm = T)
  ts.dev <- sd(as.numeric(ts), na.rm = T)
  (as.numeric(ts) - ts.mean)/ts.dev
}
city_data$WCI <- znorm(city_data$WCI)

# average clickbait rate for each account
group_bait_city = as.data.frame(dt %>%
                                  group_by(dt$account_name)%>%
                                  summarise(rate = mean(bait)))
colnames(group_bait_city)[1] <- "account_name"

# merge two datasets
city_data_new <- merge(city_data, group_bait_city, by = "account_name")
# calculate number of posts of each account
city_data_new$post_num <- as.data.frame(dt_total%>%
                                          group_by(dt_total$account_name)%>%
                                          summarise(n()))[,2]
# construct the regression models
reg_city1 <- lm(WCI ~ rate, data = city_data_new)
reg_city2 <- lm(WCI ~ rate +accountdur+post_num, data = city_data_new)
reg_city3 <- lm(WCI ~ rate +accountdur+post_num+Population+GDP+GDPPerCap+PBExp+GrossArea+Internet.DSL+Phone+Tourism,
                data = city_data_new)

# output the results
summary(reg_city1)
summary(reg_city2)
summary(reg_city3)


######### Appendix Figure A3 ###########
# check the basic statistics of reads and likes
c(mean(dt_reads$reads), median(dt_reads$reads), var(dt_reads$reads))
c(mean(dt_likes$likes), median(dt_likes$likes), var(dt_likes$likes))

# check how many likes that are below 0
prop.table(table(dt_likes$likes == 0))*100

# replicate Figure A3
plotlist = {}
plotlist[[1]] <- ggplot() + theme_bw(base_size=12, base_family='Times New Roman')+
  geom_histogram(aes(dt_reads$reads), bins = 50)+xlab("Number of Views")+ylab("Number of titles")+
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        axis.text=element_text(size=12), )
plotlist[[2]] <- ggplot() + theme_bw(base_size=12, base_family='Times New Roman') +
  geom_histogram(aes(dt_likes$likes), bins = 50)+xlab("Number of Likes (titles posted after function update of likes on Mar 14)")+ylab("Number of titles")+
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        axis.text=element_text(size=12), )
p <- grid.arrange(grobs=plotlist,nrow=2)

